Superstore Sales Dataset - https://www.kaggle.com/datasets/rohitsahoo/sales-forecasting

suppressMessages(library(tidyverse))
suppressMessages(library(maps))
suppressMessages(library(lubridate))
suppressMessages(library(viridis))
suppressMessages(library(janitor))
suppressMessages(library(scales))
suppressMessages(library(corrplot))
suppressMessages(library(RColorBrewer))
suppressMessages(library(wesanderson))
suppressMessages(library(htmltools))
suppressMessages(library(webr))
suppressMessages(library(fastDummies))

Data Cleaning

sales<-read.csv("train.csv")
head(sales,3)
##   Row.ID       Order.ID Order.Date  Ship.Date    Ship.Mode Customer.ID
## 1      1 CA-2017-152156 08/11/2017 11/11/2017 Second Class    CG-12520
## 2      2 CA-2017-152156 08/11/2017 11/11/2017 Second Class    CG-12520
## 3      3 CA-2017-138688 12/06/2017 16/06/2017 Second Class    DV-13045
##     Customer.Name   Segment       Country        City      State Postal.Code
## 1     Claire Gute  Consumer United States   Henderson   Kentucky       42420
## 2     Claire Gute  Consumer United States   Henderson   Kentucky       42420
## 3 Darrin Van Huff Corporate United States Los Angeles California       90036
##   Region      Product.ID        Category Sub.Category
## 1  South FUR-BO-10001798       Furniture    Bookcases
## 2  South FUR-CH-10000454       Furniture       Chairs
## 3   West OFF-LA-10000240 Office Supplies       Labels
##                                                  Product.Name  Sales
## 1                           Bush Somerset Collection Bookcase 261.96
## 2 Hon Deluxe Fabric Upholstered Stacking Chairs, Rounded Back 731.94
## 3   Self-Adhesive Address Labels for Typewriters by Universal  14.62
dim(sales)
## [1] 9800   18
summary(sales)
##      Row.ID       Order.ID          Order.Date         Ship.Date        
##  Min.   :   1   Length:9800        Length:9800        Length:9800       
##  1st Qu.:2451   Class :character   Class :character   Class :character  
##  Median :4900   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :4900                                                           
##  3rd Qu.:7350                                                           
##  Max.   :9800                                                           
##                                                                         
##   Ship.Mode         Customer.ID        Customer.Name        Segment         
##  Length:9800        Length:9800        Length:9800        Length:9800       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##    Country              City              State            Postal.Code   
##  Length:9800        Length:9800        Length:9800        Min.   : 1040  
##  Class :character   Class :character   Class :character   1st Qu.:23223  
##  Mode  :character   Mode  :character   Mode  :character   Median :58103  
##                                                           Mean   :55273  
##                                                           3rd Qu.:90008  
##                                                           Max.   :99301  
##                                                           NA's   :11     
##     Region           Product.ID          Category         Sub.Category      
##  Length:9800        Length:9800        Length:9800        Length:9800       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  Product.Name           Sales          
##  Length:9800        Min.   :    0.444  
##  Class :character   1st Qu.:   17.248  
##  Mode  :character   Median :   54.490  
##                     Mean   :  230.769  
##                     3rd Qu.:  210.605  
##                     Max.   :22638.480  
## 
str(sales)
## 'data.frame':    9800 obs. of  18 variables:
##  $ Row.ID       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Order.ID     : chr  "CA-2017-152156" "CA-2017-152156" "CA-2017-138688" "US-2016-108966" ...
##  $ Order.Date   : chr  "08/11/2017" "08/11/2017" "12/06/2017" "11/10/2016" ...
##  $ Ship.Date    : chr  "11/11/2017" "11/11/2017" "16/06/2017" "18/10/2016" ...
##  $ Ship.Mode    : chr  "Second Class" "Second Class" "Second Class" "Standard Class" ...
##  $ Customer.ID  : chr  "CG-12520" "CG-12520" "DV-13045" "SO-20335" ...
##  $ Customer.Name: chr  "Claire Gute" "Claire Gute" "Darrin Van Huff" "Sean O'Donnell" ...
##  $ Segment      : chr  "Consumer" "Consumer" "Corporate" "Consumer" ...
##  $ Country      : chr  "United States" "United States" "United States" "United States" ...
##  $ City         : chr  "Henderson" "Henderson" "Los Angeles" "Fort Lauderdale" ...
##  $ State        : chr  "Kentucky" "Kentucky" "California" "Florida" ...
##  $ Postal.Code  : int  42420 42420 90036 33311 33311 90032 90032 90032 90032 90032 ...
##  $ Region       : chr  "South" "South" "West" "South" ...
##  $ Product.ID   : chr  "FUR-BO-10001798" "FUR-CH-10000454" "OFF-LA-10000240" "FUR-TA-10000577" ...
##  $ Category     : chr  "Furniture" "Furniture" "Office Supplies" "Furniture" ...
##  $ Sub.Category : chr  "Bookcases" "Chairs" "Labels" "Tables" ...
##  $ Product.Name : chr  "Bush Somerset Collection Bookcase" "Hon Deluxe Fabric Upholstered Stacking Chairs, Rounded Back" "Self-Adhesive Address Labels for Typewriters by Universal" "Bretford CR4500 Series Slim Rectangular Table" ...
##  $ Sales        : num  262 731.9 14.6 957.6 22.4 ...
colnames(sales)
##  [1] "Row.ID"        "Order.ID"      "Order.Date"    "Ship.Date"    
##  [5] "Ship.Mode"     "Customer.ID"   "Customer.Name" "Segment"      
##  [9] "Country"       "City"          "State"         "Postal.Code"  
## [13] "Region"        "Product.ID"    "Category"      "Sub.Category" 
## [17] "Product.Name"  "Sales"

Check for duplicates and missing values

sum(duplicated(sales))
## [1] 0
colSums(is.na(sales))
##        Row.ID      Order.ID    Order.Date     Ship.Date     Ship.Mode 
##             0             0             0             0             0 
##   Customer.ID Customer.Name       Segment       Country          City 
##             0             0             0             0             0 
##         State   Postal.Code        Region    Product.ID      Category 
##             0            11             0             0             0 
##  Sub.Category  Product.Name         Sales 
##             0             0             0
sales[is.na(sales$Postal.Code), ]
##      Row.ID       Order.ID Order.Date  Ship.Date      Ship.Mode Customer.ID
## 2235   2235 CA-2018-104066 05/12/2018 10/12/2018 Standard Class    QJ-19255
## 5275   5275 CA-2016-162887 07/11/2016 09/11/2016   Second Class    SV-20785
## 8799   8799 US-2017-150140 06/04/2017 10/04/2017 Standard Class    VM-21685
## 9147   9147 US-2017-165505 23/01/2017 27/01/2017 Standard Class    CB-12535
## 9148   9148 US-2017-165505 23/01/2017 27/01/2017 Standard Class    CB-12535
## 9149   9149 US-2017-165505 23/01/2017 27/01/2017 Standard Class    CB-12535
## 9387   9387 US-2018-127292 19/01/2018 23/01/2018 Standard Class    RM-19375
## 9388   9388 US-2018-127292 19/01/2018 23/01/2018 Standard Class    RM-19375
## 9389   9389 US-2018-127292 19/01/2018 23/01/2018 Standard Class    RM-19375
## 9390   9390 US-2018-127292 19/01/2018 23/01/2018 Standard Class    RM-19375
## 9742   9742 CA-2016-117086 08/11/2016 12/11/2016 Standard Class    QJ-19255
##         Customer.Name     Segment       Country       City   State Postal.Code
## 2235     Quincy Jones   Corporate United States Burlington Vermont          NA
## 5275 Stewart Visinsky    Consumer United States Burlington Vermont          NA
## 8799  Valerie Mitchum Home Office United States Burlington Vermont          NA
## 9147 Claudia Bergmann   Corporate United States Burlington Vermont          NA
## 9148 Claudia Bergmann   Corporate United States Burlington Vermont          NA
## 9149 Claudia Bergmann   Corporate United States Burlington Vermont          NA
## 9387    Raymond Messe    Consumer United States Burlington Vermont          NA
## 9388    Raymond Messe    Consumer United States Burlington Vermont          NA
## 9389    Raymond Messe    Consumer United States Burlington Vermont          NA
## 9390    Raymond Messe    Consumer United States Burlington Vermont          NA
## 9742     Quincy Jones   Corporate United States Burlington Vermont          NA
##      Region      Product.ID        Category Sub.Category
## 2235   East TEC-AC-10001013      Technology  Accessories
## 5275   East FUR-CH-10000595       Furniture       Chairs
## 8799   East TEC-PH-10002555      Technology       Phones
## 9147   East TEC-AC-10002926      Technology  Accessories
## 9148   East OFF-AR-10003477 Office Supplies          Art
## 9149   East OFF-ST-10001526 Office Supplies      Storage
## 9387   East OFF-PA-10000157 Office Supplies        Paper
## 9388   East OFF-PA-10001970 Office Supplies        Paper
## 9389   East OFF-AP-10000828 Office Supplies   Appliances
## 9390   East OFF-EN-10001509 Office Supplies    Envelopes
## 9742   East FUR-BO-10004834       Furniture    Bookcases
##                                                       Product.Name   Sales
## 2235                   Logitech ClearChat Comfort/USB Headset H390  205.03
## 5275                               Safco Contoured Stacking Chairs  715.20
## 8799                           Nortel Meridian M5316 Digital phone 1294.75
## 9147                         Logitech Wireless Marathon Mouse M705   99.98
## 9148                                             4009 Highlighters    8.04
## 9149                         Iceberg Mobile Mega Data/Printer Cart 1564.29
## 9387                                                     Xerox 191   79.92
## 9388                                                    Xerox 1881   12.28
## 9389                               Avanti 4.4 Cu. Ft. Refrigerator  542.94
## 9390                                     Poly String Tie Envelopes    2.04
## 9742 Riverside Palais Royal Lawyers Bookcase, Royale Cherry Finish 4404.90

All rows with a missing postal code are in Burlington, Vermont, so we fill them in with Burlington's ZIP code 05401.

# Fill missing postal codes with Burlington, VT's ZIP code (05401).
# NOTE(review): Postal.Code is numeric, so the leading zero is dropped
# and the value is stored as 5401 — acceptable here, but a character
# column would preserve real ZIP formatting.
sales$Postal.Code<-replace_na(sales$Postal.Code,05401)
colSums(is.na(sales))
##        Row.ID      Order.ID    Order.Date     Ship.Date     Ship.Mode 
##             0             0             0             0             0 
##   Customer.ID Customer.Name       Segment       Country          City 
##             0             0             0             0             0 
##         State   Postal.Code        Region    Product.ID      Category 
##             0             0             0             0             0 
##  Sub.Category  Product.Name         Sales 
##             0             0             0

modify dates

# Parse the day/month/year strings into proper Date objects.
sales$Order.Date <- as.Date(sales$Order.Date, format = "%d/%m/%Y")
sales$Ship.Date <- as.Date(sales$Ship.Date, format = "%d/%m/%Y")
class(sales$Order.Date)
## [1] "Date"
class(sales$Ship.Date)
## [1] "Date"

EDA

cat("Number of unique Order IDs:\n")
## Number of unique Order IDs:
length(unique(sales$Order.ID))
## [1] 4922
cat("Unique Ship Mode:\n")
## Unique Ship Mode:
unique(sales$Ship.Mode)
## [1] "Second Class"   "Standard Class" "First Class"    "Same Day"
cat("Number of unique Customer IDs:\n")
## Number of unique Customer IDs:
length(unique(sales$Customer.ID))
## [1] 793
cat("Number of unique Customer name:\n")
## Number of unique Customer name:
length(unique(sales$Customer.Name))
## [1] 793
cat("Unique segment:\n")
## Unique segment:
unique(sales$Segment)
## [1] "Consumer"    "Corporate"   "Home Office"
cat("Unique country:\n")
## Unique country:
unique(sales$Country)
## [1] "United States"
cat("Number of unique Cities:\n")
## Number of unique Cities:
length(unique(sales$City))
## [1] 529
cat("Unique Region:\n")
## Unique Region:
unique(sales$Region)
## [1] "South"   "West"    "Central" "East"
# Top customers by total sales value.
# Assumption: customer names are unique — the number of unique
# Customer.ID values equals the number of unique Customer.Name values
# (793 each), so grouping by name does not merge distinct customers.
customer_sales <- sales %>%
  group_by(Customer.Name) %>%
  summarise(sales = round(sum(Sales), 2)) %>%
  arrange(desc(sales))

head(customer_sales,5)
## # A tibble: 5 × 2
##   Customer.Name  sales
##   <chr>          <dbl>
## 1 Sean Miller   25043.
## 2 Tamara Chand  19052.
## 3 Raymond Buch  15117.
## 4 Tom Ashbrook  14596.
## 5 Adrian Barton 14474.
# Bar chart of the ten highest-value customers, ordered by total sales.
top_customers <- head(customer_sales, 10)
ggplot(top_customers, aes(x = fct_reorder(Customer.Name, sales), y = sales)) +
  geom_col(width = 0.7, fill = 'skyblue', col = "black") +
  labs(title = " Top 10 Most Valuable Customers", y = "Sales", x = "Customer") +
  theme(plot.title = element_text(hjust = 0.5),
        axis.text.x = element_text(angle = 20)) +
  geom_text(aes(label = sales), size = 3.5, vjust = -0.5, hjust = 0.5, angle = 0)

# mean sales amount
mean(sales$Sales)
## [1] 230.7691
# Total sales per order (one order can span several line items),
# highest first.
sales_by_order <- sales %>%
  group_by(Order.ID) %>%
  summarise(sales = round(sum(Sales), 2)) %>%
  arrange(desc(sales))
# mean sales amount by order id 
mean(sales_by_order$sales)
## [1] 459.4752
# sales distribution
# Distribution of order totals on a log10 x-axis.
# Fixed two ggplot2 deprecations that were emitting warnings:
#   `..density..` -> after_stat(density)   (deprecated in 3.4.0)
#   `size` on a line geom -> `linewidth`   (deprecated in 3.4.0)
ggplot(sales_by_order, aes(x = sales)) +
  geom_histogram(aes(y = after_stat(density)), fill = 'white', color = 'black') +
  geom_density(fill = 'skyblue', alpha = 0.4, linewidth = 0.6) +
  scale_x_log10() +
  labs(title = "Sales Distribution")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# sales date analysis
# Derive year / month / weekday columns from the order date in a single
# mutate, placed right after Order.Date (same final column order as
# chaining three separate mutates).
sales <- sales %>%
  mutate(
    order_year  = format(Order.Date, "%Y"),
    order_month = format(Order.Date, "%b"),
    order_day   = format(Order.Date, "%a"),
    .after = Order.Date
  )

# Encode month and weekday as factors with chronological levels so
# later plots sort correctly instead of alphabetically.
# NOTE(review): "%b"/"%a" output is locale-dependent; these levels
# assume an English locale — confirm if this runs elsewhere.
sales$order_month <- factor(sales$order_month,
                            levels = c("Jan", "Feb", "Mar", "Apr", "May", "Jun",
                                       "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"))
sales$order_day <- factor(sales$order_day,
                          levels = c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"))
head(sales,3)
##   Row.ID       Order.ID Order.Date order_year order_month order_day  Ship.Date
## 1      1 CA-2017-152156 2017-11-08       2017         Nov       Wed 2017-11-11
## 2      2 CA-2017-152156 2017-11-08       2017         Nov       Wed 2017-11-11
## 3      3 CA-2017-138688 2017-06-12       2017         Jun       Mon 2017-06-16
##      Ship.Mode Customer.ID   Customer.Name   Segment       Country        City
## 1 Second Class    CG-12520     Claire Gute  Consumer United States   Henderson
## 2 Second Class    CG-12520     Claire Gute  Consumer United States   Henderson
## 3 Second Class    DV-13045 Darrin Van Huff Corporate United States Los Angeles
##        State Postal.Code Region      Product.ID        Category Sub.Category
## 1   Kentucky       42420  South FUR-BO-10001798       Furniture    Bookcases
## 2   Kentucky       42420  South FUR-CH-10000454       Furniture       Chairs
## 3 California       90036   West OFF-LA-10000240 Office Supplies       Labels
##                                                  Product.Name  Sales
## 1                           Bush Somerset Collection Bookcase 261.96
## 2 Hon Deluxe Fabric Upholstered Stacking Chairs, Rounded Back 731.94
## 3   Self-Adhesive Address Labels for Typewriters by Universal  14.62
# Total sales per weekday, plotted as a labelled bar chart.
sales_by_day <- sales %>%
  group_by(order_day) %>%
  summarise(sales = round(sum(Sales), 0))

ggplot(sales_by_day, aes(x = order_day, y = sales)) +
  geom_col(fill = "skyblue", alpha = 0.7, width = 0.5, color = "black") +
  labs(x = "", y = "Sales", title = "Sales by Day") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_y_continuous(labels = comma) +
  geom_text(aes(label = sales), size = 3.5, vjust = -0.5, hjust = 0.5, angle = 0)

# Total sales per calendar month, plotted as a labelled bar chart.
sales_by_month <- sales %>%
  group_by(order_month) %>%
  summarise(sales = round(sum(Sales), 0))

ggplot(sales_by_month, aes(x = order_month, y = sales)) +
  geom_col(fill = "skyblue", alpha = 0.7, width = 0.7, color = "black") +
  labs(x = "", y = "Sales", title = "Sales by Month") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_y_continuous(labels = comma) +
  geom_text(aes(label = sales), size = 3.5, vjust = -0.5, hjust = 0.5, angle = 0)

unique(sales$order_year)
## [1] "2017" "2016" "2015" "2018"
# order_year is character (produced by format()), so declare the factor
# levels as strings explicitly rather than relying on numeric coercion.
sales$order_year <- factor(sales$order_year,
                           levels = c("2015", "2016", "2017", "2018"))
sales_by_year <- sales %>%
  group_by(order_year) %>%
  summarise(sales = round(sum(Sales), 0))

ggplot(sales_by_year, aes(x = order_year, y = sales)) +
  geom_col(fill = "skyblue", alpha = 0.7, width = 0.5, color = "black") +
  # Bug fix: title said "Sales by Month" (copy-paste from the previous
  # chart) but this chart aggregates by year.
  labs(x = "", y = "Sales", title = "Sales by Year") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_y_continuous(labels = comma) +
  geom_text(aes(label = sales), size = 3.5, vjust = -0.5, hjust = 0.5, angle = 0)

# Share of line items by shipping mode, as a labelled pie chart.
ship_mode_counts <- table(sales$Ship.Mode)
ship_mode_pct <- round(prop.table(ship_mode_counts) * 100, 0)

pie(ship_mode_counts,
    labels = paste0(names(ship_mode_counts), ": ", ship_mode_pct, "%"))

# Total sales per customer segment.
sales_by_seg <- sales %>%
  group_by(Segment) %>%
  summarise(sales = sum(Sales))

# Bug fix: coord_flip() previously sat on its own line after the plot
# expression, so it was never added to the chart — it just printed the
# CoordFlip ggproto object to the console. Chain it with `+` instead.
ggplot(sales_by_seg, aes(x = Segment, y = sales)) +
  geom_col(fill = 'skyblue', alpha = 0.8, width = 0.5) +
  labs(x = "Segment", y = "Sales") +
  geom_text(aes(label = sales), size = 3.5, vjust = -0.5, hjust = 0.5, angle = 0) +
  coord_flip()
## <ggproto object: Class CoordFlip, CoordCartesian, Coord, gg>
##     aspect: function
##     backtransform_range: function
##     clip: on
##     default: FALSE
##     distance: function
##     expand: TRUE
##     is_free: function
##     is_linear: function
##     labels: function
##     limits: list
##     modify_scales: function
##     range: function
##     render_axis_h: function
##     render_axis_v: function
##     render_bg: function
##     render_fg: function
##     setup_data: function
##     setup_layout: function
##     setup_panel_guides: function
##     setup_panel_params: function
##     setup_params: function
##     train_panel_guides: function
##     transform: function
##     super:  <ggproto object: Class CoordFlip, CoordCartesian, Coord, gg>
pie(sales_by_seg$sales, labels = paste0(sales_by_seg$Segment, ": ", round(prop.table(sales_by_seg$sales) * 100, 0), "%"))

suppressMessages(library(ggthemes))

# Monthly sales per state (long format for the line chart below).
foo <- sales %>%
  group_by(State, order_month, order_year) %>% 
  summarise(sales = sum(Sales))
## `summarise()` has grouped output by 'State', 'order_month'. You can override
## using the `.groups` argument.
# The three states with the highest overall sales.
top_states <- foo %>%
  group_by(State) %>%
  summarise(total_sales = sum(sales)) %>%
  top_n(3, total_sales) %>%
  pull(State)

filtered_data <- foo %>%
  filter(State %in% top_states)

# NOTE(review): the x axis is a character "Mon-Year" label on a discrete
# scale, so points are ordered alphabetically (Apr-2015, Apr-2016, ...),
# not chronologically — consider plotting against a real Date column.
ggplot(filtered_data, aes(x = paste(order_month, order_year, sep = "-"), y = sales, group = State, color = State)) +
  geom_line() +
  theme_tufte() +
  labs(x = "Year", y = "Sales", color = "State") +
  scale_x_discrete(labels = function(x) gsub("-", " ", x)) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

# Ten states with the highest total sales, lowest-to-highest.
top10_states <- foo %>%
  group_by(State) %>%
  summarise(total_sales = sum(sales)) %>%
  top_n(10, total_sales) %>%
  arrange(total_sales)

ggplot(top10_states, aes(x = reorder(State, desc(total_sales)), y = total_sales)) +
  geom_bar(stat = "identity", alpha = 0.7, fill = "pink") +
  geom_text(aes(label = total_sales), vjust = -0.5, hjust = 1, size = 3, color = "black") +
  labs(x = "State", y = "Total Sales") +
  coord_flip() +
  # Bug fix: a complete theme (theme_minimal) replaces all earlier
  # theme() tweaks, so the axis-text rotation set before it was being
  # discarded. Apply the complete theme first, then fine-tune.
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

sales_by_region<-sales %>% 
  group_by(Region) %>% 
  summarise(sales=round(sum(Sales),0))

PieDonut(sales_by_region,aes(Region,count=sales),
         title = "Sales by Region",r0=0.6) +
        scale_fill_manual(values = c("skyblue", "grey", "pink", "coral", "purple"))
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the webr package.
##   Please report the issue at <https://github.com/cardiomoon/webr/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.

sales_by_subcat<-sales %>% 
  group_by(Category,Sub.Category) %>% 
  summarise(sale=sum(Sales))
## `summarise()` has grouped output by 'Category'. You can override using the
## `.groups` argument.
PieDonut(sales_by_subcat, 
         aes(Category,Sub.Category,count=sale),
         title = "Sales by Category and Sub-Category")

Correlation analysis

# NOTE: `cat` shadows base::cat() from here on in the script.
cat <- sales[, c("Category", "Sales")]

# One-hot encode Category, drop the original factor column, and coerce
# every remaining column (Sales + three dummies) to numeric for cor().
catdummy <- dummy_cols(cat) %>%
  rename(furniture = 3, office_supplies = 4, technology = 5)
catdummy <- catdummy[, 2:5] %>%
  mutate(across(everything(), as.numeric))

# Correlation between sales amount and category membership.
corcatdummy <- cor(catdummy)

corrplot(corcatdummy, type = "upper", method = "color",
         addCoef.col = "black",
         tl.col = "black", diag = FALSE, order = "hclust")

# Same approach for Segment: one-hot encode, then correlate with Sales.
seg<-sales[,c("Segment","Sales")]

segdummy<-dummy_cols(seg)
head(segdummy)
##     Segment    Sales Segment_Consumer Segment_Corporate Segment_Home Office
## 1  Consumer 261.9600                1                 0                   0
## 2  Consumer 731.9400                1                 0                   0
## 3 Corporate  14.6200                0                 1                   0
## 4  Consumer 957.5775                1                 0                   0
## 5  Consumer  22.3680                1                 0                   0
## 6  Consumer  48.8600                1                 0                   0
# Rename the generated dummy columns (positions 3-5) to snake_case.
segdummy<-segdummy %>% 
  rename(consumer=3,corporate=4, home_office=5)

# NOTE(review): dummy_cols() already produces numeric (integer) dummies,
# so these conversions are defensive no-ops.
segdummy$consumer<-as.numeric(segdummy$consumer)
segdummy$corporate<-as.numeric(segdummy$corporate)
segdummy$home_office<-as.numeric(segdummy$home_office)
# Drop the original Segment column, keeping Sales + the three dummies.
segdummy<-segdummy[,2:5]

corsegdummy<-cor(segdummy)

corrplot(corsegdummy, type = "upper", method = "color",
         addCoef.col = "black",
         tl.col = "black", diag = FALSE, order = "hclust")

ship <- sales[, c("Ship.Mode", "Sales")]

# One-hot encode Ship.Mode and keep Sales plus the four dummy columns.
# Only three dummies are renamed — column 5 (Standard Class) keeps its
# generated name, matching the original analysis.
shipdummy <- dummy_cols(ship)[, 2:6] %>%
  rename(first_class = 2, same_day = 3, second_class = 4) %>%
  mutate(across(c(first_class, same_day, second_class), as.numeric))

# Correlation between sales amount and shipping mode.
corshipdummy <- cor(shipdummy)

corrplot(corshipdummy, type = "upper", method = "color",
         addCoef.col = "black",
         tl.col = "black", diag = FALSE, order = "hclust")

re <- sales[, c("Region", "Sales")]

# One-hot encode Region and keep Sales + the four region dummies.
# mutate_all() is superseded in modern dplyr; use across() instead
# (already the idiom elsewhere in current dplyr code).
redummy <- dummy_cols(re)[, 2:6] %>%
  mutate(across(everything(), as.numeric))

# Correlation between sales amount and region membership.
corredummy <- cor(redummy)

corrplot(corredummy, type = "upper", method = "color",
         addCoef.col = "black",
         tl.col = "black", diag = FALSE, order = "hclust")

Date plots

# Keep just the columns needed for the date histogram.
sales_small <- sales[c("Order.Date", "Sales")]

# Base-R histogram of order dates; breaks = 18 requests roughly 18 bins
# across the observed date range.
hist(sales_small$Order.Date, breaks = 18,
     main = "Order Date Histogram",
     xlab = "Order Date",
     ylab = "Count")

Predict sales for most valuable states (Top 3) in 2018.

# Daily sales per state (with month/year kept for reference).
foo <- sales %>%
  group_by(State, Order.Date, order_month, order_year) %>% 
  summarise(sales = sum(Sales))
## `summarise()` has grouped output by 'State', 'Order.Date', 'order_month'. You
## can override using the `.groups` argument.
# Three states with the highest overall sales.
top_states <- foo %>%
  group_by(State) %>%
  summarise(total_sales = sum(sales)) %>%
  top_n(3, total_sales) %>%
  pull(State)

df <- foo %>%
  filter(State %in% top_states)
dim(df)
## [1] 1455    5
# Collapse daily totals to monthly totals by flooring each date to the
# first of its month, then re-aggregating.
df$Order.Date <- floor_date(df$Order.Date, "month")
df<- df %>%
  group_by(State, Order.Date) %>% 
  summarise(sales = sum(sales))
## `summarise()` has grouped output by 'State'. You can override using the
## `.groups` argument.
# One monthly series per state.
ca_df <- subset(df, State == "California", select = c(Order.Date, sales))
ny_df <- subset(df, State == "New York", select = c(Order.Date, sales))
tx_df <- subset(df, State == "Texas", select = c(Order.Date, sales))
library(xts)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## ################################### WARNING ###################################
## # We noticed you have dplyr installed. The dplyr lag() function breaks how    #
## # base R's lag() function is supposed to work, which breaks lag(my_xts).      #
## #                                                                             #
## # Calls to lag(my_xts) that you enter or source() into this session won't     #
## # work correctly.                                                             #
## #                                                                             #
## # All package code is unaffected because it is protected by the R namespace   #
## # mechanism.                                                                  #
## #                                                                             #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
## #                                                                             #
## # You can use stats::lag() to make sure you're not using dplyr::lag(), or you #
## # can add conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop   #
## # dplyr from breaking base R's lag() function.                                #
## ################################### WARNING ###################################
## 
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
## 
##     first, last
# Convert each state's monthly totals into an xts time series indexed
# by the month-floored order date.
ca_xts <- as.xts(ca_df$sales, order.by = ca_df$Order.Date)
ny_xts <- as.xts(ny_df$sales, order.by = ny_df$Order.Date)
tx_xts <- as.xts(tx_df$sales, order.by = tx_df$Order.Date)
plot(ca_xts, main = "CA Sales", ylab = "sales")

plot(ny_xts, main = "NY Sales", ylab = "sales")

plot(tx_xts, main = "TX Sales", ylab = "sales")

# Train/test split: fit on everything through Dec 2017; hold out
# Jan 2018 onward for evaluation.
ca_train <- ca_xts["/2017-12-01"]
ca_test <- ca_xts["2018-01-01/"]

ny_train <- ny_xts["/2017-12-01"]
ny_test <- ny_xts["2018-01-01/"]

tx_train <- tx_xts["/2017-12-01"]
tx_test <- tx_xts["2018-01-01/"]
# Inspect autocorrelation structure of each training series.
acf(ca_train)

acf(ny_train)

acf(tx_train)

suppressMessages(library(tseries))
adf.test(ca_train)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  ca_train
## Dickey-Fuller = -2.5812, Lag order = 3, p-value = 0.347
## alternative hypothesis: stationary
kpss.test(ca_train)
## 
##  KPSS Test for Level Stationarity
## 
## data:  ca_train
## KPSS Level = 0.57412, Truncation lag parameter = 3, p-value = 0.02499
adf.test(ny_train)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  ny_train
## Dickey-Fuller = -3.1615, Lag order = 3, p-value = 0.1212
## alternative hypothesis: stationary
kpss.test(ny_train)
## Warning in kpss.test(ny_train): p-value greater than printed p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  ny_train
## KPSS Level = 0.21396, Truncation lag parameter = 3, p-value = 0.1
adf.test(tx_train)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  tx_train
## Dickey-Fuller = -2.8367, Lag order = 3, p-value = 0.2476
## alternative hypothesis: stationary
kpss.test(tx_train)
## Warning in kpss.test(tx_train): p-value greater than printed p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  tx_train
## KPSS Level = 0.066898, Truncation lag parameter = 3, p-value = 0.1

1. Use forecast::auto.arima with no seasonality to get a baseline model

suppressMessages(library(forecast))

ca_base <- auto.arima(ca_train, seasonal = FALSE)
ca_base
## Series: ca_train 
## ARIMA(0,1,0) 
## 
## sigma^2 = 40706014:  log likelihood = -356.3
## AIC=714.59   AICc=714.71   BIC=716.15
ny_base <- auto.arima(ny_train, seasonal = FALSE)
ny_base
## Series: ny_train 
## ARIMA(0,0,0) with non-zero mean 
## 
## Coefficients:
##            mean
##       5905.2494
## s.e.   928.9626
## 
## sigma^2 = 31954513:  log likelihood = -361.61
## AIC=727.22   AICc=727.59   BIC=730.39
tx_base <- auto.arima(tx_train, seasonal = FALSE)
tx_base
## Series: tx_train 
## ARIMA(0,0,0) with non-zero mean 
## 
## Coefficients:
##            mean
##       3476.4104
## s.e.   547.4565
## 
## sigma^2 = 11097751:  log likelihood = -342.58
## AIC=689.15   AICc=689.51   BIC=692.32

CA

# Forecast horizon = number of held-out 2018 months.
a <- length(ca_test)
forecast_ca_base <- forecast(ca_base, h = a)
plot(forecast_ca_base, main = "Forecasts of CA monthly sales 2018 - baseline model")

# Coerce both sides to plain numeric before subtracting: ca_test is xts
# and forecast_ca_base$mean is ts, and mixing the two classes raised an
# 'Incompatible methods ("Ops.xts", "Ops.ts")' warning. Values align
# positionally (same horizon), so the residuals are unchanged.
residuals_ca <- as.numeric(ca_test) - as.numeric(forecast_ca_base$mean)
checkresiduals(residuals_ca)

## 
##  Ljung-Box test
## 
## data:  Residuals
## Q* = 7.4532, df = 3, p-value = 0.05877
## 
## Model df: 0.   Total lags used: 3
acc_ca_base <- accuracy(forecast_ca_base, ca_test)
acc_ca_base
##                      ME     RMSE      MAE        MPE      MAPE      MASE
## Training set   530.7376  6290.89 4743.383  -48.45317  91.80765 0.9722362
## Test set     -9503.7180 10403.43 9503.718 -107.21032 107.21032 1.9479468
##                    ACF1
## Training set -0.3233951
## Test set             NA

NY

forecast_ny_base <- forecast(ny_base, h = 12)
plot(forecast_ny_base, main = "Forecasts of NY monthly sales 2018 - baseline model")

acc_ny_base <- accuracy(forecast_ny_base, ny_test)
acc_ny_base
##                        ME     RMSE      MAE        MPE      MAPE      MASE
## Training set 1.730665e-12 5573.768 4290.426 -4573.4298 4603.0904 0.7619477
## Test set     1.909098e+03 7630.254 5732.466  -144.6164  189.5209 1.0180433
##                    ACF1
## Training set 0.02528753
## Test set             NA
# Numeric coercion avoids the xts/ts 'Incompatible methods' warning;
# residual values are unchanged (positional alignment over the horizon).
residuals_ny <- as.numeric(ny_test) - as.numeric(forecast_ny_base$mean)
checkresiduals(residuals_ny)

## 
##  Ljung-Box test
## 
## data:  Residuals
## Q* = 5.6298, df = 3, p-value = 0.1311
## 
## Model df: 0.   Total lags used: 3

TX

forecast_tx_base <- forecast(tx_base, h = 12)
plot(forecast_tx_base, main = "Forecasts of TX monthly sales 2018 - baseline model")

acc_tx_base <- accuracy(forecast_tx_base, tx_test)
acc_tx_base
##                        ME     RMSE      MAE       MPE      MAPE      MASE
## Training set 8.716187e-13 3284.734 2054.647 -290.5260 315.97409 0.6890568
## Test set     1.420695e+02 2171.060 1540.734  -50.1806  73.59972 0.5167082
##                     ACF1
## Training set -0.08386677
## Test set              NA
# Numeric coercion avoids the xts/ts 'Incompatible methods' warning;
# residual values are unchanged (positional alignment over the horizon).
residuals_tx <- as.numeric(tx_test) - as.numeric(forecast_tx_base$mean)
checkresiduals(residuals_tx)

## 
##  Ljung-Box test
## 
## data:  Residuals
## Q* = 2.2786, df = 3, p-value = 0.5166
## 
## Model df: 0.   Total lags used: 3

Predictions for all three states are poor, with high test RMSE — the non-seasonal baselines cannot capture the obvious monthly seasonality.

2. Seasonal ARIMA (SARIMA) - here seasonality is monthly

CA

# --- CA: seasonal ARIMA (monthly seasonality, frequency = 12) ---
ca_SA <- auto.arima(ts(ca_train, frequency=12), seasonal = TRUE)
summary(ca_SA)
## Series: ts(ca_train, frequency = 12) 
## ARIMA(0,0,0)(1,1,0)[12] with drift 
## 
## Coefficients:
##          sar1     drift
##       -0.5587  128.7082
## s.e.   0.1987   58.9000
## 
## sigma^2 = 20386913:  log likelihood = -237.22
## AIC=480.44   AICc=481.64   BIC=483.97
## 
## Training set error measures:
##                     ME     RMSE     MAE       MPE     MAPE      MASE       ACF1
## Training set -239.1278 3529.684 2213.39 -16.77474 31.89948 0.5269297 -0.1784056

# Forecast 12 months and evaluate against the CA test set.
forecast_ca_SA <- forecast(ca_SA, h = 12)
acc_ca_SA <- accuracy(forecast_ca_SA, ca_test)
acc_ca_SA
##                     ME     RMSE      MAE        MPE     MAPE      MASE
## Training set -239.1278 3529.684 2213.390 -16.774740 31.89948 0.4536715
## Test set     1008.1372 3058.372 2559.622   4.742513 21.85629 0.5246376
##                    ACF1
## Training set -0.1784056
## Test set             NA
plot(forecast_ca_SA, main = "Forecasts of CA monthly sales 2018 - SARIMA model")

# Coerce to numeric before subtracting: mixing xts (ca_test) with ts
# ($mean) raised an "Incompatible methods (Ops.xts, Ops.ts)" warning.
residuals_ca1 <- as.numeric(ca_test) - as.numeric(forecast_ca_SA$mean)
checkresiduals(residuals_ca1)

## 
##  Ljung-Box test
## 
## data:  Residuals
## Q* = 1.4383, df = 3, p-value = 0.6966
## 
## Model df: 0.   Total lags used: 3

NY

# --- NY: seasonal ARIMA (monthly seasonality, frequency = 12) ---
ny_SA <- auto.arima(ts(ny_train, frequency=12), seasonal = TRUE)
summary(ny_SA)
## Series: ts(ny_train, frequency = 12) 
## ARIMA(0,0,0)(0,1,0)[12] 
## 
## sigma^2 = 16420566:  log likelihood = -233.42
## AIC=468.85   AICc=469.03   BIC=470.02
## 
## Training set error measures:
##                   ME     RMSE      MAE     MPE     MAPE      MASE       ACF1
## Training set 181.801 3308.632 1873.802 2.31772 45.75619 0.6673032 0.03047806

# Forecast 12 months and evaluate against the NY test set.
forecast_ny_SA <- forecast(ny_SA, h = 12)
acc_ny_SA <- accuracy(forecast_ny_SA, ny_test)
acc_ny_SA
##                    ME     RMSE      MAE       MPE     MAPE      MASE       ACF1
## Training set  181.801 3308.632 1873.802   2.31772 45.75619 0.3327733 0.03047806
## Test set     1912.134 7451.183 5124.712 -43.14743 84.30004 0.9101108         NA
plot(forecast_ny_SA, main = "Forecasts of NY monthly sales 2018 - SARIMA model")

# Numeric coercion avoids the xts-vs-ts mixed Ops dispatch warning.
residuals_ny1 <- as.numeric(ny_test) - as.numeric(forecast_ny_SA$mean)
checkresiduals(residuals_ny1)

## 
##  Ljung-Box test
## 
## data:  Residuals
## Q* = 1.8541, df = 3, p-value = 0.6032
## 
## Model df: 0.   Total lags used: 3

TX

# --- TX: seasonal ARIMA (monthly seasonality, frequency = 12) ---
# Note: auto.arima selected a non-seasonal white-noise-with-mean model,
# so this matches the TX baseline's accuracy figures.
tx_SA <- auto.arima(ts(tx_train, frequency=12), seasonal = TRUE)
summary(tx_SA)
## Series: ts(tx_train, frequency = 12) 
## ARIMA(0,0,0) with non-zero mean 
## 
## Coefficients:
##            mean
##       3476.4104
## s.e.   547.4565
## 
## sigma^2 = 11097751:  log likelihood = -342.58
## AIC=689.15   AICc=689.51   BIC=692.32
## 
## Training set error measures:
##                        ME     RMSE      MAE      MPE     MAPE      MASE
## Training set 8.716187e-13 3284.734 2054.647 -290.526 315.9741 0.8631626
##                     ACF1
## Training set -0.08386677

# Forecast 12 months and evaluate against the TX test set.
forecast_tx_SA <- forecast(tx_SA, h = 12)
acc_tx_SA <- accuracy(forecast_tx_SA, tx_test)
acc_tx_SA
##                        ME     RMSE      MAE       MPE      MAPE      MASE
## Training set 8.716187e-13 3284.734 2054.647 -290.5260 315.97409 0.6890568
## Test set     1.420695e+02 2171.060 1540.734  -50.1806  73.59972 0.5167082
##                     ACF1
## Training set -0.08386677
## Test set              NA
plot(forecast_tx_SA, main = "Forecasts of TX monthly sales 2018 - SARIMA model")

# Numeric coercion avoids the xts-vs-ts mixed Ops dispatch warning.
residuals_tx1 <- as.numeric(tx_test) - as.numeric(forecast_tx_SA$mean)
checkresiduals(residuals_tx1)

## 
##  Ljung-Box test
## 
## data:  Residuals
## Q* = 2.2786, df = 3, p-value = 0.5166
## 
## Model df: 0.   Total lags used: 3

3. Holt-Winters

CA

# --- CA: Holt-Winters exponential smoothing (additive seasonal) ---
HW.ca <- HoltWinters(ts(ca_train, frequency=12))
HW.ca
## Holt-Winters exponential smoothing with trend and additive seasonal component.
## 
## Call:
## HoltWinters(x = ts(ca_train, frequency = 12))
## 
## Smoothing parameters:
##  alpha: 0.04188232
##  beta : 1
##  gamma: 0.5762624
## 
## Coefficients:
##           [,1]
## a   12756.7598
## b     725.3795
## s1  -2804.7811
## s2  -4604.6022
## s3   4418.2333
## s4   -258.1442
## s5  -1135.5021
## s6   2491.7567
## s7    503.9543
## s8   -244.7041
## s9    489.4752
## s10 -2298.6156
## s11  4900.7352
## s12  8772.0362

# Forecast 12 months and evaluate against the CA test set.
forecast_HW.ca <- forecast(HW.ca, h = 12)
acc_HW.ca <- accuracy(forecast_HW.ca, ca_test)
acc_HW.ca
##                      ME     RMSE      MAE        MPE     MAPE      MASE
## Training set   772.0022 4924.683 3278.289  -5.257923 45.60812 0.6719405
## Test set     -6268.6492 7123.830 6268.649 -59.193624 59.19362 1.2848651
##                    ACF1
## Training set -0.2178956
## Test set             NA
plot(forecast_HW.ca, main = "Forecasts of CA monthly sales 2018 - HW model")

# Numeric coercion avoids the xts-vs-ts mixed Ops dispatch warning.
residuals_ca2 <- as.numeric(ca_test) - as.numeric(forecast_HW.ca$mean)
checkresiduals(residuals_ca2)

## 
##  Ljung-Box test
## 
## data:  Residuals
## Q* = 3.8892, df = 3, p-value = 0.2737
## 
## Model df: 0.   Total lags used: 3

NY

# --- NY: Holt-Winters exponential smoothing (additive seasonal) ---
HW.ny <- HoltWinters(ts(ny_train, frequency=12))
HW.ny
## Holt-Winters exponential smoothing with trend and additive seasonal component.
## 
## Call:
## HoltWinters(x = ts(ny_train, frequency = 12))
## 
## Smoothing parameters:
##  alpha: 0.1652065
##  beta : 0
##  gamma: 1
## 
## Coefficients:
##           [,1]
## a    5767.9254
## b     137.6698
## s1  -5426.9718
## s2  -4627.4806
## s3    204.2712
## s4  -3007.2745
## s5  -1559.6880
## s6  -1928.5106
## s7  -4678.6197
## s8  -2970.9045
## s9   4191.1736
## s10 -2322.8889
## s11  1873.7533
## s12  7442.5966

# Forecast 12 months and evaluate against the NY test set.
forecast_HW.ny <- forecast(HW.ny, h = 12)
acc_HW.ny <- accuracy(forecast_HW.ny, ny_test)
acc_HW.ny
##                     ME     RMSE      MAE        MPE     MAPE      MASE
## Training set -639.4068 3888.855 2584.666  -9.857669 51.36922 0.4590175
## Test set     2219.1132 7120.570 4695.783 -12.867198 63.60048 0.8339361
##                    ACF1
## Training set -0.1772572
## Test set             NA
plot(forecast_HW.ny, main = "Forecasts of NY monthly sales 2018 - HW model")

# Numeric coercion avoids the xts-vs-ts mixed Ops dispatch warning.
residuals_ny2 <- as.numeric(ny_test) - as.numeric(forecast_HW.ny$mean)
checkresiduals(residuals_ny2)

## 
##  Ljung-Box test
## 
## data:  Residuals
## Q* = 1.556, df = 3, p-value = 0.6694
## 
## Model df: 0.   Total lags used: 3

TX

# --- TX: Holt-Winters exponential smoothing (additive seasonal) ---
HW.tx <- HoltWinters(ts(tx_train, frequency=12))
HW.tx
## Holt-Winters exponential smoothing with trend and additive seasonal component.
## 
## Call:
## HoltWinters(x = ts(tx_train, frequency = 12))
## 
## Smoothing parameters:
##  alpha: 0
##  beta : 0
##  gamma: 0.6406015
## 
## Coefficients:
##           [,1]
## a    1122.8424
## b    -158.5682
## s1  -2513.6859
## s2  -2898.7952
## s3   2415.6664
## s4   1218.0931
## s5   -189.1532
## s6  -1453.9943
## s7    399.4519
## s8   1606.1684
## s9   4826.0672
## s10  -273.5921
## s11  3119.0982
## s12  1737.5450

# Forecast 12 months and evaluate against the TX test set.
forecast_HW.tx <- forecast(HW.tx, h = 12)
acc_HW.tx <- accuracy(forecast_HW.tx, tx_test)
acc_HW.tx
##                     ME     RMSE      MAE       MPE     MAPE      MASE
## Training set  519.8805 4025.533 2456.531 -14.16475 123.2660 0.8238346
## Test set     2860.2586 3575.189 3137.229  73.46266 104.6471 1.0521170
##                    ACF1
## Training set -0.2082932
## Test set             NA
plot(forecast_HW.tx, main = "Forecasts of TX monthly sales 2018 - HW model")

# Numeric coercion avoids the xts-vs-ts mixed Ops dispatch warning.
residuals_tx2 <- as.numeric(tx_test) - as.numeric(forecast_HW.tx$mean)
checkresiduals(residuals_tx2)

## 
##  Ljung-Box test
## 
## data:  Residuals
## Q* = 2.6453, df = 3, p-value = 0.4496
## 
## Model df: 0.   Total lags used: 3

4. ARFIMA

CA

# --- CA: ARFIMA (fractionally differenced ARMA) ---
m41 <- forecast::arfima(ts(ca_train))

summary(m41)
## 
## Call:
##   forecast::arfima(y = ts(ca_train)) 
## 
## Coefficients:
##    Estimate Std. Error z value Pr(>|z|)    
## d 1.179e-01  3.762e-06   31336   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## sigma[eps] = 5110.637 
## [d.tol = 0.0001221, M = 100, h = 3.788e-06]
## Log likelihood: -358.5 ==> AIC = 721.0526 [2 deg.freedom]

# Forecast over the full test horizon; coerce to numeric before
# subtracting (ca_test is xts, $mean is ts — mixed Ops dispatch warned).
pred_41 <- forecast(m41, h = length(ca_test))
resid_41 <- as.numeric(ca_test) - as.numeric(pred_41$mean)
checkresiduals(resid_41)

## 
##  Ljung-Box test
## 
## data:  Residuals
## Q* = 7.7712, df = 3, p-value = 0.05099
## 
## Model df: 0.   Total lags used: 3

# Mean residual: positive bias means the model under-forecasts CA sales.
mean(resid_41)
## [1] 2985.732

NY

# --- NY: ARFIMA (fractionally differenced ARMA) ---
m42 <- forecast::arfima(ts(ny_train))

summary(m42)
## 
## Call:
##   forecast::arfima(y = ts(ny_train)) 
## 
## Coefficients:
##    Estimate Std. Error z value Pr(>|z|)    
## d 3.517e-03  3.794e-06   927.1   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## sigma[eps] = 5573.727 
## [d.tol = 0.0001221, M = 100, h = 3.821e-06]
## Log likelihood: -361.6 ==> AIC = 727.2199 [2 deg.freedom]

# Forecast over the full test horizon; numeric coercion avoids the
# xts-vs-ts mixed Ops dispatch warning on subtraction.
pred_42 <- forecast(m42, h = length(ny_test))
resid_42 <- as.numeric(ny_test) - as.numeric(pred_42$mean)
checkresiduals(resid_42)

## 
##  Ljung-Box test
## 
## data:  Residuals
## Q* = 5.6349, df = 3, p-value = 0.1308
## 
## Model df: 0.   Total lags used: 3

# Mean residual: positive bias means the model under-forecasts NY sales.
mean(resid_42)
## [1] 1901.498

TX

# --- TX: ARFIMA (fractionally differenced ARMA) ---
m43 <- forecast::arfima(ts(tx_train))

summary(m43)
## 
## Call:
##   forecast::arfima(y = ts(tx_train)) 
## 
## Coefficients:
##    Estimate Std. Error z value Pr(>|z|)    
## d 4.583e-05  3.595e-06   12.75   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## sigma[eps] = 3284.751 
## [d.tol = 0.0001221, M = 100, h = 3.62e-06]
## Log likelihood: -342.6 ==> AIC = 689.1481 [2 deg.freedom]

# Forecast over the full test horizon; numeric coercion avoids the
# xts-vs-ts mixed Ops dispatch warning on subtraction.
pred_43 <- forecast(m43, h = length(tx_test))
resid_43 <- as.numeric(tx_test) - as.numeric(pred_43$mean)
checkresiduals(resid_43)

## 
##  Ljung-Box test
## 
## data:  Residuals
## Q* = 2.2786, df = 3, p-value = 0.5166
## 
## Model df: 0.   Total lags used: 3

# Mean residual for TX — close to zero relative to the other states.
mean(resid_43)
## [1] 142.0573

Predict overall sales (all states aggregated)

# Aggregate to one total-sales observation per order date.
# (Order.Date was converted to Date earlier in the document.)
agg_sales <- sales %>%
  group_by(Order.Date) %>% 
  summarise(sales = sum(Sales))
# Full daily calendar covering the data span, used to expose dates
# on which no orders were placed.
drange <- seq(as.Date("2015-01-03"), as.Date("2018-12-30"), by="day")

dall <- data.frame(Order.Date = drange)

# Keep every calendar day (all.y = TRUE): days with no orders get NA sales.
dall <- merge(agg_sales, dall, by=c("Order.Date"), all.y=TRUE)
dim(dall[is.na(dall$sales),])
## [1] 228   2
dim(dall)
## [1] 1458    2
# Build an xts time series indexed by the full daily calendar.
sales_xts <- as.xts(dall$sales, order.by = dall$Order.Date)

There are missing dates in the original dataset, so after merging it with the full date range, some days have missing sales values. We fill these with 0, because no sales occurred on those days.

# Zero-fill the gaps: a missing date means no sales occurred that day.
sales_xts <- na.fill(sales_xts, 0)
sum(is.na(sales_xts))
## [1] 0
plot(sales_xts, main = "Total Sales by Date", ylab = "sales")

# Train on everything through Nov 2018; hold out December 2018 as test.
train <- sales_xts["/2018-11-30"]
test <- sales_xts["2018-12-01/"]
dim(train)
## [1] 1428    1
dim(test)
## [1] 30  1
acf(train)

# Stationarity checks: ADF rejects a unit root, while KPSS rejects level
# stationarity — a mixed signal, so differencing is still reasonable.
adf.test(train)
## Warning in adf.test(train): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  train
## Dickey-Fuller = -8.3022, Lag order = 11, p-value = 0.01
## alternative hypothesis: stationary
kpss.test(train)
## Warning in kpss.test(train): p-value smaller than printed p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  train
## KPSS Level = 2.3848, Truncation lag parameter = 7, p-value = 0.01

1. forecast::auto.arima

# Baseline: non-seasonal auto.arima on the daily aggregated series.
base <- auto.arima(train, seasonal = FALSE)
base
## Series: train 
## ARIMA(0,1,1) 
## 
## Coefficients:
##           ma1
##       -0.9558
## s.e.   0.0099
## 
## sigma^2 = 4593764:  log likelihood = -12970.79
## AIC=25945.58   AICc=25945.59   BIC=25956.11
# Forecast one step per day of the held-out December test window.
a <- length(test)
forecast_base <- forecast(base, h = a)
plot(forecast_base, main = "Forecasts of Daily sales 2018 - baseline model")

2. seasonal arima

# Intended seasonal ARIMA on the daily series.
# NOTE(review): `train` is an xts object with no frequency attribute, so
# `seasonal = TRUE` has no effect — auto.arima selects the same
# ARIMA(0,1,1) as the baseline (visible in the output below). To capture
# weekly seasonality, fit on ts(as.numeric(train), frequency = 7) instead.
model.1 <- auto.arima(train, seasonal = TRUE)
summary(model.1)
## Series: train 
## ARIMA(0,1,1) 
## 
## Coefficients:
##           ma1
##       -0.9558
## s.e.   0.0099
## 
## sigma^2 = 4593764:  log likelihood = -12970.79
## AIC=25945.58   AICc=25945.59   BIC=25956.11
## 
## Training set error measures:
##                   ME     RMSE      MAE  MPE MAPE      MASE       ACF1
## Training set 46.6345 2141.805 1380.525 -Inf  Inf 0.7746933 0.03413206
m1.forecast <- forecast(model.1, h=a)

plot(m1.forecast, main = "Forecasts of daily sales 2018 - sarima")

# In-sample residual diagnostics: the Ljung-Box test rejects whiteness
# (p < 0.001), indicating remaining autocorrelation in the residuals.
resid_m1 <- residuals(model.1)

mean(resid_m1) 
## [1] 46.6345
checkresiduals(resid_m1)

## 
##  Ljung-Box test
## 
## data:  Residuals
## Q* = 29.71, df = 10, p-value = 0.0009555
## 
## Model df: 0.   Total lags used: 10

3. holt winters

# --- Daily totals: Holt-Winters (trend only) ---
# With gamma = FALSE no seasonal component is fitted, so the `seasonal`
# argument is inert here; kept for compatibility with the recorded call.
hw <- HoltWinters(train, gamma = FALSE, seasonal = 'additive')
hw
## Holt-Winters exponential smoothing with trend and without seasonal component.
## 
## Call:
## HoltWinters(x = train, gamma = FALSE, seasonal = "additive")
## 
## Smoothing parameters:
##  alpha: 0.103466
##  beta : 0.02833581
##  gamma: FALSE
## 
## Coefficients:
##        [,1]
## a 3956.2250
## b   22.1407

# Forecast the December test window.
hw_forecast <- forecast(hw, h = length(test))
# Coerce both sides to plain numeric before subtracting: `test` is xts
# while forecast()$mean is ts, and mixed dispatch raised an
# "Incompatible methods (Ops.xts, Ops.ts)" warning.
resid_hw <- as.numeric(test) - as.numeric(hw_forecast$mean)
plot(hw_forecast, main = "Forecasts of daily sales 2018 - HoltWinters")

plot(hw_forecast$mean, main = "Forecasts of daily sales 2018(zoom in) - HoltWinters")

checkresiduals(resid_hw)

## 
##  Ljung-Box test
## 
## data:  Residuals
## Q* = 5.5824, df = 6, p-value = 0.4716
## 
## Model df: 0.   Total lags used: 6

4. ARFIMA

# --- Daily totals: ARFIMA (fractionally differenced ARMA) ---
m4 <- forecast::arfima(ts(train))
AIC(m4)
## [1] 25958.21
summary(m4)
## 
## Call:
##   forecast::arfima(y = ts(train)) 
## 
## Coefficients:
##        Estimate Std. Error z value Pr(>|z|)    
## d       0.38048    0.05255   7.241 4.47e-13 ***
## ar.ar1  0.40244    0.06000   6.707 1.98e-11 ***
## ma.ma1  0.70450    0.06126  11.500  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## sigma[eps] = 2136.335 
## [d.tol = 0.0001221, M = 100, h = 0.0001367]
## Log likelihood: -1.298e+04 ==> AIC = 25958.21 [4 deg.freedom]

# Forecast the December test window.
pred_ARFIMA <- forecast(m4, h = length(test))

# Coerce both sides to plain numeric before subtracting: `test` is xts
# while forecast()$mean is ts, and mixed dispatch raised an
# "Incompatible methods (Ops.xts, Ops.ts)" warning.
resid_arf <- as.numeric(test) - as.numeric(pred_ARFIMA$mean)
checkresiduals(resid_arf)

## 
##  Ljung-Box test
## 
## data:  Residuals
## Q* = 6.5911, df = 6, p-value = 0.3603
## 
## Model df: 0.   Total lags used: 6

# Mean residual is near zero — the ARFIMA forecasts are nearly unbiased.
mean(resid_arf)
## [1] 0.6193146
plot(pred_ARFIMA, main = "Forecasts of daily sales 2018 - ARFIMA")

plot(pred_ARFIMA$mean, main = "Forecasts of daily sales 2018(zoom in) - ARFIMA")